{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "ff182681",
   "metadata": {},
   "source": [
    "### PhyloP mean conservation score over gene body"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "cfd8fa24",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pyBigWig\n",
    "import numpy as np\n",
    "import pandas as pd \n",
    "from pathlib import Path "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "652c06ce",
   "metadata": {},
   "outputs": [],
   "source": [
    "wkdir=\"/lustre/scratch126/humgen/projects/interval_rna/interval_rna_seq/thomasVDS/misexpression_v3/\"\n",
    "wkdir_path = Path(wkdir)\n",
    "\n",
    "inactive_genes_path = wkdir_path.joinpath('3_misexp_genes/bed_files/inactive_genes_phylop.bed')\n",
    "phylop_scores_bw = wkdir_path.joinpath(\"reference/conservation/phylop/hg38.phyloP100way.bw\")\n",
    "# output\n",
    "out_dir =wkdir_path.joinpath(\"3_misexp_genes/phylop\")\n",
    "out_dir.mkdir(parents=True, exist_ok=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "dfb29b2c",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Number of inactive genes: 8650\n"
     ]
    }
   ],
   "source": [
    "inactive_genes_df = pd.read_csv(inactive_genes_path, sep='\\t', header=None, names=['chrom', 'start', 'end', \"gene_id\"])\n",
    "inactive_genes = inactive_genes_df.gene_id.unique()\n",
    "print(f\"Number of inactive genes: {len(inactive_genes)}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "0477c58c",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/nfs/users/nfs_t/tv5/.conda/envs/tv5_base/lib/python3.7/site-packages/ipykernel_launcher.py:15: RuntimeWarning: Mean of empty slice\n",
      "  from ipykernel import kernelapp as app\n"
     ]
    }
   ],
   "source": [
    "phylop_bw = pyBigWig.open(str(phylop_scores_bw))\n",
    "phylop_consv_results = {}\n",
    "for index, row in inactive_genes_df.iterrows():\n",
    "    chrom = row['chrom']\n",
    "    start = row['start']\n",
    "    end = row['end']\n",
    "    gene_id = row[\"gene_id\"]\n",
    "\n",
    "    # query the BigWig file for the given interval\n",
    "    values = phylop_bw.values(chrom, start, end)\n",
    "    if len(values) != end - start: \n",
    "        print(gene_id)\n",
    "    \n",
    "    # calculate mean, ignores NaNs\n",
    "    mean_value = np.nanmean(values) if values is not None else None\n",
    "    \n",
    "    # add metrics to dictionary \n",
    "    phylop_consv_results[index] = [chrom, start, end, gene_id, mean_value]\n",
    "# close the BigWig file\n",
    "phylop_bw.close()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "02890992",
   "metadata": {},
   "outputs": [],
   "source": [
    "pylop_results_columns=[\"chrom\", \"start\", \"end\", \"gene_id\", \"phylop_mean\"]\n",
    "phylop_consv_results_df = pd.DataFrame.from_dict(phylop_consv_results, orient=\"index\", columns=pylop_results_columns)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "20036653",
   "metadata": {},
   "outputs": [],
   "source": [
    "# write to file \n",
    "phylop_consv_results_path = out_dir.joinpath(\"inactive_gene_phylop_gene_body.tsv\")\n",
    "phylop_consv_results_df.to_csv(phylop_consv_results_path, sep=\"\\t\", index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8e839a1f",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
